library(sparklyr)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
sc <- spark_connect(master = "local", version = "2.4.5")
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.0 ✓ purrr 0.3.4
✓ tibble 3.0.1 ✓ dplyr 0.8.5
✓ tidyr 1.1.0 ✓ stringr 1.4.0
✓ readr 1.3.1 ✓ forcats 0.5.0
── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x purrr::invoke() masks sparklyr::invoke()
x dplyr::lag() masks stats::lag()
profiles <- spark_read_csv(
sc,
"data/profiles.csv",
escape = "\"",
memory = FALSE,
options = list(multiline = TRUE)
)
glimpse(profiles)
Rows: ??
Columns: 31
Database: spark_connection
$ age <int> 47, 27, 45, 40, 33, 27, 20, 28, 24, 34, 32, 23, 19, 45, 32, 38, 36, 25, …
$ body_type <chr> "athletic", "full figured", NA, "athletic", "athletic", "thin", "curvy",…
$ diet <chr> "mostly anything", "mostly vegetarian", NA, NA, "mostly anything", "most…
$ drinks <chr> "socially", "socially", "socially", "socially", "rarely", "socially", "s…
$ drugs <chr> "never", NA, "never", "never", "never", "sometimes", "never", "never", "…
$ education <chr> "graduated from college/university", "graduated from college/university"…
$ essay0 <chr> NA, "still figuring out what to put here... for now i'll leave you with\…
$ essay1 <chr> "working in a creative industry (brand marketing agency), helping to\nra…
$ essay2 <chr> "listening to people, hearing them and working to help them. it's\nwhat …
$ essay3 <chr> "my million dollar smile (self-deprecating humor).", "my smile, my energ…
$ essay4 <chr> "sadly, my reading is confined to newspapers and business\nperiodicals. …
$ essay5 <chr> "my daughter. usa today. exercise. some travel. world news. ice\ncream."…
$ essay6 <chr> NA, NA, "why i fill these blocks.<br />\nwhat i really, really want.<br …
$ essay7 <chr> "picking up my daughter for the weekend or going to the gym and\ncoming …
$ essay8 <chr> "i can be a procrastinator but it is often driven by my need to\nresearc…
$ essay9 <chr> "you are looking for an easy going, no drama, try anything once\namigo t…
$ ethnicity <chr> "white", "white", "other", "white", "white", "hispanic / latin, white", …
$ height <int> 72, 63, 66, 62, 74, 71, 68, 66, 62, 67, 61, 59, 68, 68, 67, 73, 75, 67, …
$ income <int> -1, -1, -1, -1, -1, -1, 20000, -1, -1, -1, 70000, -1, -1, -1, -1, 70000,…
$ job <chr> "sales / marketing / biz dev", "science / tech / engineering", "medicine…
$ last_online <chr> "2012-06-29-22-37", "2012-06-30-12-50", "2012-06-30-18-21", "2012-06-30-…
$ location <chr> "san carlos, california", "oakland, california", "san francisco, califor…
$ offspring <chr> "has a kid, but doesn’t want more", "doesn’t have kids", NA,…
$ orientation <chr> "straight", "straight", "straight", "straight", "gay", "gay", "straight"…
$ pets <chr> "likes dogs", "likes dogs", "likes dogs and likes cats", "has dogs", NA,…
$ religion <chr> "other", "atheism but not too serious about it", "catholicism but not to…
$ sex <chr> "m", "f", "m", "f", "m", "m", "f", "f", "f", "f", "f", "f", "m", "m", "f…
$ sign <chr> "scorpio and it’s fun to think about", "sagittarius but it doesn&r…
$ smokes <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", "yes",…
$ speaks <chr> "english", "english", "english (fluently)", "english", "english", "engli…
$ status <chr> "single", "single", "single", "single", "single", "single", "single", "s…
library(dplyr)
profiles %>%
summarise_all(.funs = ~ sum(as.integer(is.na(.))))
Missing values are always removed in SQL.
Use `SUM(x, na.rm = TRUE)` to silence this warning
This warning is displayed only once per session.
profiles %>%
summarise(num_neg_income = sum(as.integer(income < 0)))
profiles_char <- profiles %>%
select(-c(age, income, height)) %>%
mutate_all(~ ifelse(is.na(.), "missing", .))
profiles_num <- profiles %>%
select(age, income, height) %>%
mutate(
age = as.numeric(age),
income = ifelse(income == "-1", NA, as.numeric(income)),
height = as.numeric(height)
)
profiles <- sdf_bind_cols(profiles_char, profiles_num) %>%
compute("profiles")
profiles %>%
summarise_all(~sum(as.integer(is.na(.))))
Missing values are always removed in SQL.
Use `SUM(x, na.rm = TRUE)` to silence this warning
This warning is displayed only once per session.
profiles %>%
summarise(num_neg_income = sum(as.integer(income < 0)))
glimpse(profiles)
Rows: ??
Columns: 31
Database: spark_connection
$ body_type <chr> "thin", "curvy", "average", "fit", "athletic", "curvy", "fit", "average"…
$ diet <chr> "mostly anything", "mostly anything", "strictly anything", "anything", "…
$ drinks <chr> "socially", "socially", "often", "socially", "often", "socially", "socia…
$ drugs <chr> "sometimes", "never", "sometimes", "never", "sometimes", "missing", "som…
$ education <chr> "working on masters program", "working on college/university", "working …
$ essay0 <chr> "grad student at berkeley. i am moving to sf pretty soon and wanna\nmeet…
$ essay1 <chr> "doing my mba at berkeley and working part time. i also try to swim\neve…
$ essay2 <chr> "missing", "music. english. chemistry. dancing.", "drinking! .... i thin…
$ essay3 <chr> "i look younger than i am..", "my smile, because i am pretty much always…
$ essay4 <chr> "music: wow where do i start? radiohead, led zeppelin, pink floyd,\nphoe…
$ essay5 <chr> "- inspiring people<br />\n- swimming<br />\n- great food<br />\n- great…
$ essay6 <chr> "- how crazy life is<br />\n- finding a guy just like you (hopefully) (:…
$ essay7 <chr> "super happy", "either reading a book or seeing a live show at a local c…
$ essay8 <chr> "missing", "i can honestly say this, im a cryer.<br />\ni have the emoti…
$ essay9 <chr> "you want to meet me one of these days..", "you like a goofy sense of hu…
$ ethnicity <chr> "hispanic / latin, white", "hispanic / latin", "white", "hispanic / lati…
$ job <chr> "sales / marketing / biz dev", "hospitality / travel", "other", "artisti…
$ last_online <chr> "2012-06-23-08-25", "2012-04-19-12-59", "2012-01-11-22-11", "2012-06-30-…
$ location <chr> "berkeley, california", "richmond, california", "burlingame, california"…
$ offspring <chr> "doesn’t have kids", "doesn’t want kids", "doesn’t wan…
$ orientation <chr> "gay", "straight", "straight", "gay", "straight", "bisexual", "straight"…
$ pets <chr> "missing", "has dogs and dislikes cats", "has cats", "likes dogs", "has …
$ religion <chr> "agnosticism", "agnosticism", "missing", "catholicism but not too seriou…
$ sex <chr> "m", "f", "f", "m", "m", "f", "f", "m", "m", "m", "m", "f", "m", "m", "m…
$ sign <chr> "libra", "leo", "aquarius but it doesn’t matter", "scorpio and it&…
$ smokes <chr> "no", "no", "yes", "when drinking", "no", "no", "no", "when drinking", "…
$ speaks <chr> "english (fluently), portuguese (fluently), spanish (okay), french (okay…
$ status <chr> "single", "single", "single", "available", "single", "single", "single",…
$ age <dbl> 27, 20, 23, 19, 45, 19, 24, 48, 24, 26, 41, 33, 27, 55, 22, 26, 30, 26, …
$ income <dbl> NaN, 2e+04, NaN, NaN, NaN, NaN, 4e+04, NaN, NaN, NaN, NaN, NaN, NaN, NaN…
$ height <dbl> 71, 68, 59, 68, 68, 62, 65, 67, 72, 66, 67, 68, 68, 70, 66, 95, 74, 73, …
Now add in the response variable:
profiles <- profiles %>%
mutate(
not_working = as.integer(ifelse(job %in% c("student", "unemployed", "retired"), 1, 0))
)
profiles %>%
count(not_working)
The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
Please use the `.add` argument instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
profiles %>%
filter(not_working == "1") %>%
glimpse()
Rows: ??
Columns: 32
Database: spark_connection
$ body_type <chr> "curvy", "athletic", "missing", "fit", "fit", "athletic", "missing", "av…
$ diet <chr> "mostly vegetarian", "missing", "mostly anything", "mostly anything", "m…
$ drinks <chr> "socially", "socially", "not at all", "socially", "socially", "socially"…
$ drugs <chr> "missing", "never", "never", "never", "never", "never", "missing", "neve…
$ education <chr> "working on college/university", "working on two-year college", "working…
$ essay0 <chr> "hello, gentlemen! i am a passionate little minx with a genuine lust\nfo…
$ essay1 <chr> "right now i am in university with a joint major in apparel design\nand …
$ essay2 <chr> "giving hugs<br />\nfigure drawing<br />\nde-awkwardifying situations<br…
$ essay3 <chr> "bright blue eyes, petite, dd cup, silly, bubbly, smiling, and a\nlight<…
$ essay4 <chr> "music: the specials, operation ivy, bad brains, the clash, choking\nvic…
$ essay5 <chr> "self-respect<br />\npure, honest human connection<br />\nsexy time<br /…
$ essay6 <chr> "past sexual encounters while in boring/mundane situations (people\nwond…
$ essay7 <chr> "hanging out with my roomie/best friend, being silly, cracking\njokes, h…
$ essay8 <chr> "i get sentimental about entries sent to postsecret.com. i hope to\nsend…
$ essay9 <chr> "you're kind, silly, fairly intelligent, don't take yourself too\nseriou…
$ ethnicity <chr> "hispanic / latin, other", "asian", "asian", "white", "white", "white", …
$ job <chr> "student", "student", "student", "student", "student", "retired", "stude…
$ last_online <chr> "2012-07-01-02-33", "2012-06-27-19-12", "2012-03-26-23-53", "2012-06-30-…
$ location <chr> "san francisco, california", "redwood city, california", "moraga, califo…
$ offspring <chr> "doesn’t want kids", "doesn’t have kids, and doesn’t w…
$ orientation <chr> "bisexual", "straight", "straight", "bisexual", "straight", "straight", …
$ pets <chr> "likes dogs and likes cats", "has dogs and likes cats", "has dogs", "lik…
$ religion <chr> "atheism and laughing about it", "buddhism but not too serious about it"…
$ sex <chr> "f", "f", "f", "m", "m", "f", "m", "m", "m", "f", "f", "f", "f", "f", "m…
$ sign <chr> "virgo but it doesn’t matter", "scorpio and it’s fun to thin…
$ smokes <chr> "no", "sometimes", "no", "no", "no", "no", "no", "sometimes", "no", "no"…
$ speaks <chr> "english (fluently), spanish (okay)", "english (fluently), japanese (poo…
$ status <chr> "single", "single", "single", "single", "single", "single", "single", "s…
$ age <dbl> 19, 20, 20, 22, 22, 68, 23, 25, 34, 26, 34, 25, 23, 23, 25, 21, 23, 20, …
$ income <dbl> NaN, NaN, NaN, 20000, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, …
$ height <dbl> 62, 63, 63, 76, 73, 69, 74, 71, 73, 68, 63, 65, 68, 67, 65, 67, 62, 66, …
$ not_working <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
sdf_describe(profiles, cols = c("age", "height", "income", "not_working"))
splits <- profiles %>%
sdf_random_split(training = 0.8, testing = 0.2)
training <- splits$training %>%
compute("training")
testing <- splits$testing %>%
compute("testing")
sdf_describe(training, cols = c("age", "height", "income", "not_working"))
sdf_describe(testing, cols = c("age", "height", "income", "not_working"))
We basically want the first word of the religion variable
training <- training %>%
mutate(religion = regexp_extract(religion, "[a-zA-Z]+", 0))
training %>%
count(religion, not_working) %>%
group_by(religion) %>%
summarise(
count = sum(n),
prop_not_working = sum(not_working * n) / sum(n)
)
We suspect that people less likely to regularly drink are also less likely to regularly use drugs, let’s confirm this with a cross-tabulation (also known as a contingency table):
contingency <- training %>%
sdf_crosstab("drinks", "drugs") %>%
collect()
contingency
The contingency table is nice, but it would be better to order the levels of drinks and drugs to aid comparisons:
contingency %>%
rename(drinks = drinks_drugs) %>%
mutate(
drinks = as_factor(drinks) %>%
fct_relevel("missing", "not at all", "rarely", "socially", "often",
"very often", "desperately")
) %>%
arrange(drinks) %>%
select(drinks, missing, never, sometimes, often)
Ethnicity requires a bit of work: respondents on OKCupid were permitted to make multiple selections of ethnicity, and so we end up with potentially multiple comma-separated strings. Let’s have a look at possible values (we will use Hive string-handling functions split() and trim() together with explode() to do this):
ethnicities <- profiles %>%
select(ethnicity) %>%
mutate(ethnicity = explode(split(ethnicity, ","))) %>%
mutate(ethnicity = trim(ethnicity)) %>%
distinct(ethnicity) %>%
collect()
ethnicities <- ethnicities$ethnicity
ethnicities
[1] "pacific islander" "native american" "white" "missing"
[5] "black" "middle eastern" "indian" "other"
[9] "hispanic / latin" "asian"
Now we can use a bit of tidy-evaluation to create dummy columns corresponding to the possible ethnicities. First, we drop missing from the vector, as missing will correspond to zeroes all across the ethnicity dummies.
ethnicities <- setdiff(ethnicities, "missing")
ethnicities
[1] "pacific islander" "native american" "white" "black"
[5] "middle eastern" "indian" "other" "hispanic / latin"
[9] "asian"
ethnicity_vars <- ethnicities %>%
purrr::map(~ expr(ifelse(like(ethnicity, !!.x), 1, 0))) %>%
purrr::set_names(paste0("ethnicity_", gsub("\\s|/", "", ethnicities)))
training <- mutate(training, !!!ethnicity_vars) %>%
compute("training")
glimpse(training)
Rows: ??
Columns: 41
Database: spark_connection
$ body_type <chr> "a little extra", "a little extra", "a little extra", "a l…
$ diet <chr> "anything", "anything", "anything", "anything", "anything"…
$ drinks <chr> "often", "rarely", "socially", "socially", "socially", "so…
$ drugs <chr> "missing", "never", "missing", "never", "never", "never", …
$ education <chr> "missing", "working on ph.d program", "graduated from coll…
$ essay0 <chr> "im an impulsive creature i love life and having all types…
$ essay1 <chr> "missing", "missing", "college graduate, i'm paying off my…
$ essay2 <chr> "cooking i dont cook that often but when i do angels sing …
$ essay3 <chr> "that im big and scary and that im alot nicer than i look"…
$ essay4 <chr> "my favorite books are the wheel of time and sword of trut…
$ essay5 <chr> "i could never do without friends, family, love,.self resp…
$ essay6 <chr> "the end of the world, dragons,being a ninja,ruling the wo…
$ essay7 <chr> "the same thing i do every night... try to take over the w…
$ essay8 <chr> "i believe in ghosts", "missing", "i may have a slight obs…
$ essay9 <chr> "you want to have fun", "missing", "you're down to earth, …
$ ethnicity <chr> "native american, hispanic / latin, white", "hispanic / la…
$ job <chr> "sales / marketing / biz dev", "science / tech / engineeri…
$ last_online <chr> "2012-06-29-09-52", "2012-06-29-18-35", "2012-06-19-18-54"…
$ location <chr> "san rafael, california", "berkeley, california", "rodeo, …
$ offspring <chr> "missing", "missing", "doesn’t have kids, but might …
$ orientation <chr> "straight", "straight", "straight", "bisexual", "straight"…
$ pets <chr> "missing", "missing", "has dogs and likes cats", "likes do…
$ religion <chr> "christianity", "catholicism", "catholicism", "other", "mi…
$ sex <chr> "m", "m", "m", "m", "f", "m", "m", "m", "f", "m", "f", "m"…
$ sign <chr> "pisces and it’s fun to think about", "missing", "aq…
$ smokes <chr> "yes", "no", "when drinking", "no", "missing", "sometimes"…
$ speaks <chr> "english", "english, spanish", "english (fluently), tagalo…
$ status <chr> "single", "single", "single", "available", "single", "sing…
$ age <dbl> 29, 29, 25, 45, 26, 20, 20, 26, 32, 26, 31, 36, 34, 36, 33…
$ income <dbl> 3e+04, NaN, 2e+04, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN,…
$ height <dbl> 73, 74, 63, 67, 63, 68, 75, 66, 61, 72, 62, 66, 73, 69, 64…
$ not_working <int> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_pacificislander <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_nativeamerican <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_white <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1…
$ ethnicity_black <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ethnicity_middleeastern <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_indian <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_other <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_hispaniclatin <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_asian <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
Let’s also get the length of the essays written
training <- training %>%
mutate(
essay_length = char_length(paste(!!!syms(paste0("essay", 0:9))))
) %>%
compute("training")
glimpse(training)
Rows: ??
Columns: 42
Database: spark_connection
$ body_type <chr> "a little extra", "a little extra", "a little extra", "a l…
$ diet <chr> "anything", "anything", "anything", "anything", "anything"…
$ drinks <chr> "often", "rarely", "socially", "socially", "socially", "so…
$ drugs <chr> "missing", "never", "missing", "never", "never", "never", …
$ education <chr> "missing", "working on ph.d program", "graduated from coll…
$ essay0 <chr> "im an impulsive creature i love life and having all types…
$ essay1 <chr> "missing", "missing", "college graduate, i'm paying off my…
$ essay2 <chr> "cooking i dont cook that often but when i do angels sing …
$ essay3 <chr> "that im big and scary and that im alot nicer than i look"…
$ essay4 <chr> "my favorite books are the wheel of time and sword of trut…
$ essay5 <chr> "i could never do without friends, family, love,.self resp…
$ essay6 <chr> "the end of the world, dragons,being a ninja,ruling the wo…
$ essay7 <chr> "the same thing i do every night... try to take over the w…
$ essay8 <chr> "i believe in ghosts", "missing", "i may have a slight obs…
$ essay9 <chr> "you want to have fun", "missing", "you're down to earth, …
$ ethnicity <chr> "native american, hispanic / latin, white", "hispanic / la…
$ job <chr> "sales / marketing / biz dev", "science / tech / engineeri…
$ last_online <chr> "2012-06-29-09-52", "2012-06-29-18-35", "2012-06-19-18-54"…
$ location <chr> "san rafael, california", "berkeley, california", "rodeo, …
$ offspring <chr> "missing", "missing", "doesn’t have kids, but might …
$ orientation <chr> "straight", "straight", "straight", "bisexual", "straight"…
$ pets <chr> "missing", "missing", "has dogs and likes cats", "likes do…
$ religion <chr> "christianity", "catholicism", "catholicism", "other", "mi…
$ sex <chr> "m", "m", "m", "m", "f", "m", "m", "m", "f", "m", "f", "m"…
$ sign <chr> "pisces and it’s fun to think about", "missing", "aq…
$ smokes <chr> "yes", "no", "when drinking", "no", "missing", "sometimes"…
$ speaks <chr> "english", "english, spanish", "english (fluently), tagalo…
$ status <chr> "single", "single", "single", "available", "single", "sing…
$ age <dbl> 29, 29, 25, 45, 26, 20, 20, 26, 32, 26, 31, 36, 34, 36, 33…
$ income <dbl> 3e+04, NaN, 2e+04, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN,…
$ height <dbl> 73, 74, 63, 67, 63, 68, 75, 66, 61, 72, 62, 66, 73, 69, 64…
$ not_working <int> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_pacificislander <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_nativeamerican <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_white <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1…
$ ethnicity_black <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ethnicity_middleeastern <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_indian <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_other <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_hispaniclatin <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_asian <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ essay_length <int> 892, 79, 2570, 4240, 2945, 2055, 2746, 1837, 2681, 560, 97…
Let’s see how to scale a variable manually:
scale_age_values <- training %>%
summarize(
mean_age = mean(age),
sd_age = sd(age)
) %>%
collect()
Missing values are always removed in SQL.
Use `mean(x, na.rm = TRUE)` to silence this warning
This warning is displayed only once per session.
scale_age_values
training <- training %>%
mutate(
scaled_age = (age - !!scale_age_values$mean_age) / !!scale_age_values$sd_age
)
training
lr <- ml_logistic_regression(
training, not_working ~ scaled_age + sex + drinks + drugs + essay_length + status + diet + education
)
validation_info <- ml_evaluate(lr, training)
validation_info
BinaryLogisticRegressionSummaryImpl
Access the following via `$` or `ml_summary()`.
- features_col()
- label_col()
- predictions()
- probability_col()
- area_under_roc()
- f_measure_by_threshold()
- pr()
- precision_by_threshold()
- recall_by_threshold()
- roc()
- prediction_col()
- accuracy()
- f_measure_by_label()
- false_positive_rate_by_label()
- labels()
- precision_by_label()
- recall_by_label()
- true_positive_rate_by_label()
- weighted_f_measure()
- weighted_false_positive_rate()
- weighted_precision()
- weighted_recall()
- weighted_true_positive_rate()
roc <- validation_info$roc() %>%
collect()
ggplot(roc, aes(x = FPR, y = TPR)) +
geom_line() + geom_abline(lty = "dashed") +
coord_fixed()
validation_info$area_under_roc()
[1] 0.875979
scale_age_values <- testing %>%
summarize(
mean_age = mean(age),
sd_age = sd(age)
) %>%
collect()
testing <- testing %>%
mutate(
essay_length = char_length(paste(!!!syms(paste0("essay", 0:9)))),
scaled_age = (age - !!scale_age_values$mean_age) / !!scale_age_values$sd_age
) %>%
compute("testing")
validation_info_test <- ml_evaluate(lr, testing)
validation_info_test
BinaryLogisticRegressionSummaryImpl
Access the following via `$` or `ml_summary()`.
- features_col()
- label_col()
- predictions()
- probability_col()
- area_under_roc()
- f_measure_by_threshold()
- pr()
- precision_by_threshold()
- recall_by_threshold()
- roc()
- prediction_col()
- accuracy()
- f_measure_by_label()
- false_positive_rate_by_label()
- labels()
- precision_by_label()
- recall_by_label()
- true_positive_rate_by_label()
- weighted_f_measure()
- weighted_false_positive_rate()
- weighted_precision()
- weighted_recall()
- weighted_true_positive_rate()
roc <- validation_info_test$roc() %>%
collect()
ggplot(roc, aes(x = FPR, y = TPR)) +
geom_line() + geom_abline(lty = "dashed") +
coord_fixed()
validation_info_test$area_under_roc()
[1] 0.883113
spark_write_parquet(training, "training_data")